#!/usr/bin/perl
# Convert a plain-text copy of the APPC register into rough XML.
#
# Run as a script, this reads $inputfile and writes $outputfile.  The text
# transformation itself lives in appc_text_to_xml() so that it can be applied
# to a string without touching the filesystem.
#
# The source deliberately works on raw bytes (no "use utf8", no encoding
# layers): the non-ASCII literals below are matched byte-for-byte against the
# bytes read from the input file.
use strict;
use warnings;

my $inputfile='appc_feb08.txt'; # This file was copied and pasted via okular.
my $outputfile='appc_feb08.xml';

# appc_text_to_xml($text)
#
# Takes the register text as one string and returns the XML document as one
# string.  A backtick is used internally as a "start of section" marker, so
# any backticks present in the input are discarded up front (they would be
# deleted at the end anyway, and would otherwise cut sections short).
sub appc_text_to_xml {
    my ($text) = @_;
    my $modified = defined $text ? $text : '';

    # The backtick is reserved as the section marker; drop any from the input.
    $modified =~ tr/`//d;

    # do some cleanup: bullets go, XML-special characters are escaped.
    # '&' must be escaped first so the entities added next are not re-escaped.
    $modified =~ s!•\n!!g;
    $modified =~ s!•!!g;
    $modified =~ s!&!&amp;!g;
    $modified =~ s!<!&lt;!g;
    $modified =~ s!>!&gt;!g;

    # Replace acute-accented lower-case vowels with their plain equivalents.
    my %plain_for = (
        'á' => 'a',
        'é' => 'e',
        'í' => 'i',
        'ó' => 'o',
        'ú' => 'u',
    );
    $modified =~ s!(á|é|í|ó|ú)!$plain_for{$1}!g;

    # fiddle with the header for each block: the line directly above a line
    # starting with "APPC" is taken to be the entry name.  The leading newline
    # lets the very first line of the text match as well.
    $modified = "\n$modified";
    $modified =~ s!\n(.*)\nAPPC!\n`<name>$1</name>\nAPPC!g;

    # Mark the beginnings of all sections with a backtick.  Each heading gets
    # its own pass: the "Contact" pattern consumes the newline that follows
    # it, so a single combined pass could miss a heading on the next line.
    for my $heading (
        qr/APPC Register Entry/,
        qr/Address\(es\)/,
        qr/Website/,
        qr/Staff \(employed/,
        qr/Fee-Paying/,
        qr/Offices/,
        qr/Contact\n/,
      )
    {
        $modified =~ s!\n($heading)!\n`$1!g;
    }

    # take each section and XMLise.  Multi-line sections run up to the next
    # backtick marker (or the end of the text).
    $modified =~ s!\n`APPC Register Entry for (.*) to (.*)!\n`<from>$1</from> <to>$2</to>!g;
    $modified =~ s!\n`Address\(es\) in UK\n([^`]*)!\n`<addr>$1</addr>\n!g;
    $modified =~ s!\n`Website: (.*)!\n`<web>$1</web>!g;
    $modified =~ s!\n`Staff \(employed .*\n([^`]*)!\n`<staff>$1</staff>\n!g;
    $modified =~ s!\n`Fee-Paying .* PA .*\n([^`]*)!\n`<paclient>$1</paclient>\n!g;
    $modified =~ s!\n`Fee-Paying .* monitoring .*\n([^`]*)!\n`<monclient>$1</monclient>\n!g;
    $modified =~ s!\n`Offices .*\n([^`]*)!\n`<offices>$1</offices>\n!g;
    $modified =~ s!\n`Contact\n([^`]*)!\n`<contact>\n$1</contact>\n!g;

    # Remove the field labels that appear in the contact details.  Note that
    # this runs over the whole text, not just <contact> elements, and removes
    # the newline in front of each label too, joining it to the previous line.
    $modified =~ s!\n(?:Name|Fax|Tel|Email):!!g;

    # add entry markers: every <name> closes the previous entry and opens a
    # new one.
    my $entry_count = ( $modified =~ s!`<name>!</entry>\n\n`<entry><name>!g );
    if ($entry_count) {
        # The first entry has no predecessor to close, so drop that one
        # closing tag (first occurrence only).
        $modified =~ s!</entry>\n\n(`<entry>)!$1!;
    }

    # add xml header/footer; the last entry is closed only if one was opened.
    my $last_close = $entry_count ? '</entry>' : '';
    $modified = "<xml>\n$modified\n$last_close</xml>";

    # remove all markers
    $modified =~ s!`!!g;

    return $modified;
}

# main()
#
# Slurps $inputfile, converts it and writes the result to $outputfile.
# Dies with the system error message if either file cannot be handled.
sub main {
    open my $in, '<', $inputfile
      or die "Cannot open '$inputfile' for reading: $!\n";
    my $text = do { local $/; <$in> };    # slurp the whole file
    close $in
      or die "Cannot close '$inputfile': $!\n";

    my $xml = appc_text_to_xml($text);

    open my $out, '>', $outputfile
      or die "Cannot open '$outputfile' for writing: $!\n";
    print {$out} $xml
      or die "Cannot write to '$outputfile': $!\n";
    close $out
      or die "Cannot close '$outputfile': $!\n";

    return;
}

# Only do the file conversion when run as a script, not when loaded by
# another file.
main() unless caller;

1;